* Session setup: never pause output, start from an empty dataset, then request 1g of memory
* (the memory request must come after the clear - older Stata releases refuse it while data are loaded)
set more off
clear
set mem 1g
* From this point on every command ends at a semicolon instead of at the end of the line
#delimit ;
/*****************************************************************************************************************;
**This do-file uses the GPS coordinates of the 1999, 2004 and 2007 DHS clusters to assign each 1999 and 2007
**cluster the cluster ID of the nearest 2004 cluster.

Pre-generated datasets this dofile also uses:
BangladeshGPS_2004dropped

Raw datasets used
BangladeshGPS_2007
BangladeshGPS_1999
*****************************************************************************************************************/;

*****************************************************************************************************************;
*Paths;
*****************************************************************************************************************;

*Input GPS files - the 2004 file is generated by an earlier data-creating dofile, the 1999 and 2007 files are raw;
local indataset1999 "rawdta/DHS1999/BangladeshGPS_1999";
local indataset2004 "dtafiles/BangladeshGPS_2004dropped";
local indataset2007 "rawdta/DHS2007/BangladeshGPS_2007";

*Output file written at the very end of this dofile;
local outdataset "dtafiles/DHSGPS_matchedwith2004.dta";

*****************************************************************************************************************;
*Main;
*****************************************************************************************************************;

*The 2004 clusters simply map onto themselves - they are carried along so the final file lists the clusters of all three years;
use "`indataset2004'";

*Numeric cluster id built from the last three characters of dhsid;
generate dhsid04 = real(substr(dhsid, -3, 3));

rename latnum  latnum2004;
rename longnum longnum2004;
rename dhsyear year;

keep dhsid04 year latnum2004 longnum2004;

*Set the 2004 rows aside in a temporary file until the final append;
tempfile 2004;
save `2004', replace;


*Load the raw 2007 clusters and number them 1 to N - this running index is the merge key for the cross-join that follows;
use "`indataset2007'";
generate dhsidunified = _n;
sort dhsidunified;

*Numeric cluster id built from the last three characters of dhsid;
generate dhsid07 = real(substr(dhsid, -3, 3));

rename latnum  latnum2007;
rename longnum longnum2007;
rename dhsyear dhsyear2007;

keep dhsid07 dhsidunified dhsyear2007 latnum2007 longnum2007;

*Remember how many 2007 clusters there are - each 2004 cluster is replicated that many times later on;
count;
local num2007 = r(N);

tempfile 2007temp;
save `2007temp', replace;

*Cross-join of 2004 and 2007 clusters: every 2004 cluster is replicated once per 2007 cluster,
 the copies are numbered 1 to N within each 2004 cluster, and that number picks up the 2007 cluster with the same running index;
use "`indataset2004'", clear;
gen dhsid04=real(substr(dhsid,-3,3));
compress;
expand `num2007';

bysort dhsid: gen dhsidunified=_n;
sort dhsidunified;
keep dhsidunified dhsid04 dhsyear latnum longnum;

merge m:1 dhsidunified using `2007temp';

*Distance between every 2007 cluster and every 2004 cluster;
vincenty latnum2007 longnum2007 latnum longnum, h(distance);

*Pairs without a computable distance cannot be ranked. Previously a 2007 cluster whose distances were all missing
 was matched to every 2004 cluster, because missing equals missing in the old equality test;
drop if missing(distance);

*Keep exactly one 2004 cluster per 2007 cluster: the nearest one, with exact ties going to the lowest 2004 id.
 This replaces the rounded comparison against the group minimum, which kept every tied or near-tied candidate
 (duplicating dhsid07) and referred to the minimum only through an abbreviated variable name;
bysort dhsid07 (distance dhsid04): keep if _n==1;

keep dhsid04 dhsid07;

gen year=2007;
tempfile 2007;
save `2007', replace;

*Load the raw 1999 clusters and number them 1 to N - this running index is the merge key for the cross-join that follows;
use "`indataset1999'", clear;
generate dhsidunified = _n;
sort dhsidunified;

*Numeric cluster id built from the last three characters of dhsid;
generate dhsid99 = real(substr(dhsid, -3, 3));

rename latnum  latnum1999;
rename longnum longnum1999;
rename dhsyear dhsyear1999;

keep dhsid99 dhsidunified dhsyear1999 latnum1999 longnum1999;

*Remember how many 1999 clusters there are - each 2004 cluster is replicated that many times later on;
count;
local num1999 = r(N);

tempfile 1999temp;
save `1999temp';

*Cross-join of 2004 and 1999 clusters: every 2004 cluster is replicated once per 1999 cluster,
 the copies are numbered 1 to N within each 2004 cluster, and that number picks up the 1999 cluster with the same running index;
use "`indataset2004'", clear;
gen dhsid04=real(substr(dhsid,-3,3));
compress;
expand `num1999';

bysort dhsid: gen dhsidunified=_n;
sort dhsidunified;
keep dhsidunified dhsid04 dhsyear latnum longnum;

merge m:1 dhsidunified using `1999temp';

*Distance between every 1999 cluster and every 2004 cluster;
vincenty latnum1999 longnum1999 latnum longnum, h(distance);

*Pairs without a computable distance cannot be ranked. Previously a 1999 cluster whose distances were all missing
 was matched to every 2004 cluster, because missing equals missing in the old equality test;
drop if missing(distance);

*Keep exactly one 2004 cluster per 1999 cluster: the nearest one, with exact ties going to the lowest 2004 id.
 This replaces the rounded comparison against the group minimum, which kept every tied or near-tied candidate
 and so could list the same 1999 cluster more than once;
bysort dhsid99 (distance dhsid04): keep if _n==1;

keep dhsid04 dhsid99;

gen year=1999;

*Stack the 1999 matches currently in memory with the 2007 matches and the 2004 self-mapping;
append using "`2007'";
append using "`2004'";

*Cluster-year key: the year times 1000 plus the cluster id belonging to that year
 (dhsid04 for 2004 rows, dhsid99 for 1999 rows, dhsid07 otherwise);
gen dhsidyear = year*1000 + cond(year==2004, dhsid04, cond(year==1999, dhsid99, dhsid07));

*In every row dhsid04 holds the 2004 cluster assigned to that cluster-year;
rename dhsid04 dhsid04matched;
label variable dhsid04matched "DHS cluster id in 2004, and closest 2004 cluster in 1999 and 2007";

*Only the key and its matched 2004 cluster go into the output file;
keep dhsid04matched dhsidyear;
save "`outdataset'", replace;





